Preprocessing QC statistics ¶

Noam, July 2023¶

In [1]:
%load_ext autoreload
%autoreload 2
In [2]:
import os
MOMAPS_HOME = '/home/labs/hornsteinlab/Collaboration/MOmaps_Noam/MOmaps'
MOMAPS_DATA_HOME = '/home/labs/hornsteinlab/Collaboration/MOmaps'
LOGS_PATH = os.path.join(MOMAPS_DATA_HOME, "outputs/preprocessing/spd/logs/dNLS")
PLOT_PATH = os.path.join(MOMAPS_HOME, 'src', 'preprocessing', 'notebooks','figures','Neurons')
os.chdir(MOMAPS_HOME)
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams["image.cmap"] = "Set1"
from src.common.lib.preprocessing_utils import rescale_intensity
from src.common.lib.images_qc import *
import contextlib
import io
import matplotlib
import warnings
warnings.filterwarnings('ignore', category=pd.core.common.SettingWithCopyWarning)
from src.common.lib.qc_config_tmp import *
from src.common.lib.image_sampling_utils import *
from matplotlib.colors import LinearSegmentedColormap
In [3]:
df = log_files_qc(LOGS_PATH)
df_dapi = df[df.marker=='DAPI']
df_target = df[df.marker!='DAPI']
reading logs of batch2
reading logs of batch4
reading logs of batch3
reading logs of batch5

Total of 4 files were read.
Before dup handeling  (100193, 22)
After duplication removal #1: (100193, 23)
After duplication removal #2: (100193, 23)
In [4]:
# choose batches
batches = [f'batch{i}' for i in range(2,6)]
batches
Out[4]:
['batch2', 'batch3', 'batch4', 'batch5']

Raw Files Validation¶

  1. How many site tiff files do we have in each folder?
  2. Are all existing files valid? (tif, at least 2049kB, not corrupetd)
In [5]:
root_directory_raw = os.path.join(MOMAPS_DATA_HOME, 'input', 'images', 'raw', 'SpinningDisk','deltaNLS_sort')

batches_raw = [batch.replace("_16bit_no_downsample","") for batch in batches]
raws = run_validate_folder_structure(root_directory_raw, False, dnls_panels, dnls_markers.copy(),PLOT_PATH, dnls_marker_info,
                                    dnls_cell_lines_to_cond, reps, dnls_cell_lines_for_disp, dnls_expected_dapi_raw,
                                     batches=batches_raw, fig_width=5)
batch2
Folder structure is valid.
No bad files are found.
Total Sites:  27000
========
batch3
Folder structure is invalid. Missing paths:
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/raw/SpinningDisk/deltaNLS_sort/batch3/WT/panelN
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/raw/SpinningDisk/deltaNLS_sort/batch3/TDP43/panelN
No bad files are found.
Total Sites:  25800
========
batch4
Folder structure is invalid. Missing paths:
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/raw/SpinningDisk/deltaNLS_sort/batch4/WT/panelN
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/raw/SpinningDisk/deltaNLS_sort/batch4/TDP43/panelN
No bad files are found.
Total Sites:  25500
========
batch5
Folder structure is valid.
No bad files are found.
Total Sites:  26897
========
====================

Processed Files Validation¶

  1. How many site npy files do we have in each folder? -> How many sites survived the pre-processing?
  2. Are all existing files valid? (at least 100kB, npy not corrupted)
In [6]:
root_directory_proc = os.path.join(MOMAPS_DATA_HOME, 'input', 'images', 'processed', 'spd2',
                              'SpinningDisk','deltaNLS')
procs = run_validate_folder_structure(root_directory_proc, True, dnls_panels, dnls_markers,PLOT_PATH,dnls_marker_info,
                                    dnls_cell_lines_to_cond, reps, dnls_cell_lines_for_disp, dnls_expected_dapi_raw,
                                     batches=batches, fig_width=5)
batch2
Folder structure is valid.
No bad files are found.
Total Sites:  25586
========
batch3
Folder structure is invalid. Missing paths:
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/deltaNLS/batch3/WT/Untreated/TDP43N
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/deltaNLS/batch3/TDP43/dox/TDP43N
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/deltaNLS/batch3/TDP43/Untreated/TDP43N
No bad files are found.
Total Sites:  24336
========
batch4
Folder structure is invalid. Missing paths:
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/deltaNLS/batch4/WT/Untreated/TDP43N
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/deltaNLS/batch4/TDP43/dox/TDP43N
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/deltaNLS/batch4/TDP43/Untreated/TDP43N
No bad files are found.
Total Sites:  23638
========
batch5
Folder structure is valid.
No bad files are found.
Total Sites:  25527
========
====================

Difference between Raw and Processed¶

In [7]:
display_diff(batches, raws, procs, PLOT_PATH, fig_width=5)
batch2
========
batch3
========
batch4
========
batch5
========

Variance in each batch¶

In [8]:
#for batch in list(range(3,9)) + ['7_16bit','8_16bit','9_16bit']:  
for batch in batches:
    with contextlib.redirect_stdout(io.StringIO()):
        var = sample_and_calc_variance(root_directory_proc, batch, 
                                       sample_size_per_markers=200, cond_count=2, rep_count=len(reps), 
                                       num_markers=len(dnls_markers))
    print(f'{batch} var: ',var)
batch2 var:  0.011521469031638067
batch3 var:  0.010357440534791738
batch4 var:  0.010663615289945838
batch5 var:  0.01020901013294272

filtering qc¶

By order of filtering

1. % site survival after Brenner on DAPI channel¶

Percentage out of the total sites

In [9]:
dapi_filter_by_brenner = show_site_survival_dapi_brenner(df_dapi,batches, dnls_line_colors, dnls_panels)

2. % Site survival after Cellpose¶

Percentage out of the sites that passed the previous filter. In parenthesis are absolute values.

A site will be filtered out if Cellpose found 0 cells in it.

In [10]:
dapi_filter_by_cellpose = show_site_survival_dapi_cellpose(df_dapi, batches, dapi_filter_by_brenner, dnls_line_colors, dnls_panels)

3. % Site survival by tiling¶

Percentage out of the sites that passed the previous filter. In parenthesis are absolute values.

A site will be filtered out if after tiling, no tile is containing at least 85% of a cell that Cellpose detected.

In [11]:
dapi_filter_by_tiling=show_site_survival_dapi_tiling(df_dapi, batches, dapi_filter_by_cellpose, dnls_line_colors, dnls_panels)

4. % Site survival after Brenner on target channel¶

Percentage out of the sites that passed the previous filter. In parenthesis are absolute values (if different than the percentages).

In [12]:
show_site_survival_target_brenner(df_dapi, df_target, dapi_filter_by_tiling)

Numbers!¶

  1. Total number of tiles: for each condition, we can know how many tiles we have --> number of data points for the model to train and infer on --> number of points in UMAPs..
  2. Total number of whole cells: for each condtion, we can know how many whole cells we have
In [13]:
names = ['Total number of tiles', 'Total number of whole cells']
stats = ['n_valid_tiles','site_whole_cells_counts_sum','site_cell_count']
total_sum = calc_total_sums(df_target, df_dapi, stats)
    

for stat, name in zip(stats[:2], names):
    to_heatmap = total_sum.rename(columns={stat:'index'})
    plot_filtering_heatmap(to_heatmap, extra_index='marker', vmin=None, vmax=None,
                          xlabel = name, show_sum=True, figsize=(4,8))
In [14]:
show_total_sum_tables(total_sum)
n_valid_tiles % valid tiles site_whole_cells_counts_sum site_cell_count
batch2
count 156.000000 156.000000 156.000000 156.000000
mean 487.147436 4.871474 536.814103 1409.724359
std 152.615680 1.526157 174.064312 557.999637
min 211.000000 2.110000 224.000000 433.000000
25% 369.000000 3.690000 398.000000 873.500000
50% 456.500000 4.565000 496.000000 1410.000000
75% 613.250000 6.132500 687.500000 1871.750000
max 831.000000 8.310000 919.000000 2525.000000
sum 75995.000000 NaN 83743.000000 219917.000000
expected_count 450.000000 450.000000 450.000000 450.000000
n_valid_tiles % valid tiles site_whole_cells_counts_sum site_cell_count
batch3
count 150.000000 150.000000 150.000000 150.000000
mean 579.380000 5.793800 647.213333 1618.353333
std 211.183392 2.111834 245.000632 718.404077
min 123.000000 1.230000 131.000000 285.000000
25% 332.500000 3.325000 358.000000 746.750000
50% 673.000000 6.730000 756.500000 1941.000000
75% 759.500000 7.595000 849.500000 2233.250000
max 891.000000 8.910000 1006.000000 2647.000000
sum 86907.000000 NaN 97082.000000 242753.000000
expected_count 450.000000 450.000000 450.000000 450.000000
n_valid_tiles % valid tiles site_whole_cells_counts_sum site_cell_count
batch4
count 149.000000 149.000000 149.000000 149.000000
mean 528.865772 5.288658 586.469799 1501.711409
std 197.328908 1.973289 225.492451 669.566533
min 40.000000 0.400000 42.000000 81.000000
25% 350.000000 3.500000 398.000000 808.000000
50% 566.000000 5.660000 629.000000 1709.000000
75% 696.000000 6.960000 781.000000 2098.000000
max 887.000000 8.870000 993.000000 2613.000000
sum 78801.000000 NaN 87384.000000 223755.000000
expected_count 450.000000 450.000000 450.000000 450.000000
n_valid_tiles % valid tiles site_whole_cells_counts_sum site_cell_count
batch5
count 156.000000 156.000000 156.000000 156.000000
mean 554.634615 5.546346 619.596154 1550.474359
std 190.310852 1.903109 220.682834 659.156092
min 105.000000 1.050000 114.000000 249.000000
25% 368.250000 3.682500 396.000000 821.250000
50% 593.000000 5.930000 683.000000 1718.000000
75% 734.000000 7.340000 830.000000 2144.000000
max 839.000000 8.390000 955.000000 2503.000000
sum 86523.000000 NaN 96657.000000 241874.000000
expected_count 450.000000 450.000000 450.000000 450.000000
n valid tiles % valid tiles site_whole_cells_counts_sum site_cell_count
All batches
count 611.000000 611.000000 611.000000 611.000000
mean 537.194763 5.371948 597.162029 1519.310966
std 191.403650 1.914036 220.853208 655.868995
min 40.000000 0.400000 42.000000 81.000000
25% 358.000000 3.580000 386.500000 807.500000
50% 563.000000 5.630000 628.000000 1693.000000
75% 705.000000 7.050000 795.500000 2098.000000
max 891.000000 8.910000 1006.000000 2647.000000
sum 328226.000000 NaN 364866.000000 928299.000000
expected_count 450.000000 450.000000 450.000000 450.000000

Number of Cells in Site for each batch and cell line¶

In [15]:
df_no_empty_sites = df_dapi[df_dapi.n_valid_tiles !=0]
plot_cell_count(df_no_empty_sites, dnls_lines_order, dnls_custom_palette, y='site_cell_count_sum', 
                title='Cell Count Average per Site (from tiles)')

plot_cell_count(df_no_empty_sites, dnls_lines_order, dnls_custom_palette, y='site_whole_cells_counts_sum',
                title='Whole Cell Count Average per Site')

plot_cell_count(df_no_empty_sites, dnls_lines_order, dnls_custom_palette, y='site_cell_count',
               title='Cellpose Cell Count Average per Site')

number of valid tiles per image (site)¶

In [16]:
plot_catplot(df_dapi, custom_palette,reps, x='n_valid_tiles', x_title='valid tiles count', batch_min=2, batch_max=5)

Heatmap QC per batch, panel and cell line(tiles that passed QC condition) ¶

In [17]:
plot_hm(df_dapi, split_by='rep', rows='cell_line_cond', columns='panel')

Assessing Staining Reproducibility and Outliers¶

In [18]:
for batch in batches:
    print(batch)
    #batch_num = batch.replace('batch',"")
    run_calc_hist_new(f'deltaNLS_sort/{batch}', dnls_cell_lines_for_disp, dnls_markers, 
                           hist_sample=10,sample_size_per_markers=200, ncols=8, nrows=4, dnls=True)
    print("="*30)
batch2
/home/labs/hornsteinlab/Collaboration/MOmaps_Noam/MOmaps/src/common/lib/images_qc.py:914: UserWarning: This figure includes Axes that are not compatible with tight_layout, so results might be incorrect.
  plt.tight_layout()
/home/labs/hornsteinlab/Collaboration/MOmaps/anaconda3/momaps_37/lib/python3.7/site-packages/IPython/core/pylabtools.py:151: UserWarning: This figure includes Axes that are not compatible with tight_layout, so results might be incorrect.
  fig.canvas.print_figure(bytes_io, **kw)
==============================
batch3
==============================
batch4
==============================
batch5
==============================
In [20]:
# save notebook as HTML ( the HTML will be saved in the same folder the original script is)
from IPython.display import display, Javascript
display(Javascript('IPython.notebook.save_checkpoint();'))
os.system('jupyter nbconvert --to html src/preprocessing/notebooks/no_ds/qc_report_newPP_dNLS.ipynb')
[NbConvertApp] Converting notebook src/preprocessing/notebooks/no_ds/qc_report_newPP_dNLS.ipynb to html
[NbConvertApp] Writing 16690053 bytes to src/preprocessing/notebooks/no_ds/qc_report_newPP_dNLS.html
Out[20]:
0
In [ ]: